My super title
Posted on Tue 21 April 2020 in yeah
These interactive plots are generated by Bokeh
import math

# print() call form: the statement form `print '...'` is a SyntaxError on
# Python 3, which the rest of this notebook requires (it uses f-strings).
print('import done')
Plotting Covid-19 Cases according to Zipcodes using Bokeh
Import the required modules
# Standard library
import datetime
import json

# Third-party
import geopandas as gpd
import pandas as pd
import requests
import tabula
from bokeh.io import output_notebook, reset_output, show
from bokeh.models import Column, Div, Row
from bokeh.plotting import figure
from dateutil import parser

# Send Bokeh output to the notebook (resetting any earlier output mode first).
reset_output()
output_notebook()
Download the Covid-19 case data from San Diego County
# Download the county's "cases by zip code" PDF and save it under a file
# name stamped with today's date (covid19_in_sd_<YYYY-MM-DD>.pdf).
url = 'https://www.sandiegocounty.gov/content/dam/sdc/hhsa/programs/phs/Epidemiology/COVID-19%20Summary%20of%20Cases%20by%20Zip%20Code.pdf'

# A timeout keeps the cell from hanging forever if the server stops responding.
pdf = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as a ".pdf".
pdf.raise_for_status()

with open(f'covid19_in_sd_{datetime.datetime.now().date()}.pdf', 'wb') as f:
    f.write(pdf.content)
Check if the pdf is downloaded
# IPython shell escape (not plain Python): list the PDF files in the working directory.
!ls *.pdf
covid19_in_sd_2020-04-18.pdf covid19_in_sd_2020-04-20.pdf covid19_in_sd_2020-04-19.pdf covid19_in_sd_2020-04-21.pdf
Read the pdf and clean up the data
def tabula_convert_pdf_to_df(pdf):
    """Extract the per-zip-code case table from a county PDF report.

    Page 1 of the report is read twice with tabula: once in ``stream`` mode
    to get the case table, and once with tabula's default settings, where
    the first column header carries the report title.

    The extracted table holds zip codes and counts in two side-by-side
    column pairs (``Zip Code``/``Unnamed: 0`` and ``Zip Code.1``/``Count.1``).
    The pairs are stacked into one long table and the last two rows of the
    stacked result are dropped (presumably non-data rows such as totals --
    confirm against the PDF layout).

    Args:
        pdf: Path of the PDF file; passed straight to ``tabula.read_pdf``.

    Returns:
        A ``(title, total_count, df)`` tuple: the text after the last
        carriage return of the title header cell, the sum of all case
        counts as a plain ``int``, and a DataFrame with a string
        ``ZipCode`` column and an ``int16`` ``CaseCount`` column.
    """
    raw_data = tabula.read_pdf(pdf, stream=True, pages=1)[0]
    title = tabula.read_pdf(pdf, pages=1)[0].columns[0].split('\r')[-1]

    # NOTE: the earlier per-token date parsing of the title was removed. Its
    # results were never used, it swallowed every exception, and indexing
    # the parsed dates could raise IndexError on a title with fewer dates.

    df = pd.DataFrame({
        'ZipCode': pd.concat([raw_data['Zip Code'].astype(str),
                              raw_data['Zip Code.1'].astype(str)]),
        'CaseCount': pd.concat([raw_data['Unnamed: 0'], raw_data['Count.1']]),
    })[:-2]
    df['CaseCount'] = df['CaseCount'].astype('int16')

    # Series.sum() accumulates in a wide integer. The builtin sum() adds the
    # int16 scalars one at a time and can wrap once the total passes 32767.
    total_count = int(df['CaseCount'].sum())
    return title, total_count, df


title, total_count, count_data = tabula_convert_pdf_to_df('covid19_in_sd_2020-04-21.pdf')
Get the geojson file of communities
Later in this notebook, I want to plot the data on a map based on zip code geometry. I used the GeoJSON export from https://data.sandiegocounty.gov/Maps-and-Geographical-Resources/Zip-Codes/vsuf-uefy to get the geojson file. In addition to the zip code geometry, it also has the name of the community each zip code belongs to.
# Load the zip-code boundaries (and community names) from the GeoJSON export.
# Plain string literal: the original f-prefix had no placeholders to format.
county_gpd = gpd.read_file('Sandiego_Zip_codes.geojson')
county_gpd.head()
| | community | shape_star | shape_stle | zip | geometry |
|---|---|---|---|---|---|
| 0 | Alpine | 4149939944.16 | 326045.262676 | 91901 | MULTIPOLYGON (((-116.74539 32.96063, -116.7408... |
| 1 | Bonita | 273909416.836 | 113257.374615 | 91902 | MULTIPOLYGON (((-116.97172 32.70838, -116.9712... |
| 2 | Boulevard | 2735681408.51 | 241725.552214 | 91905 | MULTIPOLYGON (((-116.23165 32.75083, -116.2280... |
| 3 | Campo | 3066759065.62 | 287410.325075 | 91906 | MULTIPOLYGON (((-116.35677 32.70460, -116.3572... |
| 4 | Chula Vista | 403437442.009 | 112587.791814 | 91910 | MULTIPOLYGON (((-117.06354 32.65011, -117.0634... |
I then merge the GeoJSON zip code geometry data with the case count data. I do a right merge (`how='right'`) with the zip code as the common key, so all the right-hand rows (the case counts per zip code) are preserved.
# Right join on zip code: every row of the case-count table is kept, and
# counts whose zip code has no boundary record get missing geometry columns.
merged = county_gpd.merge(
    count_data,
    how='right',
    left_on='zip',
    right_on='ZipCode',
)
# Drop the boundary file's own zip column and give the community column a clearer name.
merged = merged.drop(columns=['zip']).rename(columns={'community': 'CommunityName'})
merged.head()
| | CommunityName | shape_star | shape_stle | geometry | ZipCode | CaseCount |
|---|---|---|---|---|---|---|
| 0 | Alpine | 4149939944.16 | 326045.262676 | MULTIPOLYGON (((-116.74539 32.96063, -116.7408... | 91901 | 2 |
| 1 | Bonita | 273909416.836 | 113257.374615 | MULTIPOLYGON (((-116.97172 32.70838, -116.9712... | 91902 | 18 |
| 2 | Boulevard | 2735681408.51 | 241725.552214 | MULTIPOLYGON (((-116.23165 32.75083, -116.2280... | 91905 | 2 |
| 3 | Chula Vista | 403437442.009 | 112587.791814 | MULTIPOLYGON (((-117.06354 32.65011, -117.0634... | 91910 | 70 |
| 4 | Chula Vista | 329043951.316 | 93108.6951441 | MULTIPOLYGON (((-117.04641 32.62846, -117.0463... | 91911 | 98 |
Plot the case counts as function of zip code and community
# Keep only the columns the bar charts need (no geometry); any missing
# value in them is replaced by the label 'Unknown'.
bar_columns = ['ZipCode', 'CaseCount', 'CommunityName']
nogeo_data = merged[bar_columns].fillna('Unknown')
Split the communities into three parts to plot them separately.
import numpy as np  # kept: no longer used by this cell, but other cells may rely on it

# Rows for the community named 'San Diego' get a chart of their own.
is_san_diego = nogeo_data['CommunityName'] == 'San Diego'
split0 = nogeo_data.loc[is_san_diego]

# Split all remaining rows into two halves by position. When the row count
# is odd the first half gets the extra row, matching np.array_split(..., 2).
# Positional slicing is used because np.array_split on a DataFrame relies on
# the deprecated DataFrame.swapaxes.
others = nogeo_data.loc[~is_san_diego]
half = (len(others) + 1) // 2
split1, split2 = others.iloc[:half], others.iloc[half:]

# Group by (community, zip code) so Bokeh can build nested x-range plots.
sandiego = split0.groupby(by=['CommunityName', 'ZipCode'])
part1 = split1.groupby(by=['CommunityName', 'ZipCode'])
part2 = split2.groupby(by=['CommunityName', 'ZipCode'])
def create_plot(df):
    """Build a bar chart of case counts over a nested community/zip-code axis.

    Args:
        df: Data handed to Bokeh both as the categorical ``x_range`` and as
            the glyph ``source``. The notebook passes pandas group-by
            objects keyed on ``CommunityName`` and ``ZipCode``.

    Returns:
        The configured Bokeh figure.
    """
    hover_fields = [
        ("CaseCount", "@CaseCount_mean"),
        ("Community Name, ZipCode", "@CommunityName_ZipCode"),
    ]
    p = figure(
        plot_height=250,
        plot_width=800,
        x_range=df,
        toolbar_location=None,
        tooltips=hover_fields,
    )
    p.vbar(
        x='CommunityName_ZipCode',
        top='CaseCount_mean',
        width=1,
        source=df,
        line_color="white",
    )

    # Ranges, grid and outline.
    p.y_range.start = 0
    p.x_range.range_padding = 0.05
    p.x_range.group_padding = 1.0
    p.xgrid.grid_line_color = None
    p.outline_line_color = None

    # Axis labels: both label levels of the x axis share the same rotation.
    label_angle = 22 / 28
    p.yaxis.axis_label = "Case Count"
    p.xaxis.major_label_orientation = label_angle
    p.xaxis.group_label_orientation = label_angle

    # Font sizes.
    p.xaxis.major_label_text_font_size = "8pt"
    p.xaxis.group_text_font_size = "10pt"
    p.title.text_font_size = "16pt"
    p.xaxis.axis_label_text_font_size = "16pt"
    p.yaxis.axis_label_text_font_size = "16pt"
    return p
# Stack the three bar charts vertically, in the order: San Diego, part 1, part 2.
# NOTE (author's observation): Bokeh does not sort the group-by categories;
# this looks like a Bokeh bug.
bar_chart = Column(*(create_plot(group) for group in (sandiego, part1, part2)))
Choropleth Map of Covid-19 Cases According to Zip Code in San Diego
# Serialise the merged table to JSON text and parse it into a dict. Rows
# with any missing value are dropped first: case counts listed under
# "unknown" carry no geo information.
geojson_text = merged.dropna().to_json()
merged_json = json.loads(geojson_text)

# Bokeh needs the JSON as a string, so dump the parsed dict back to text.
json_data = json.dumps(merged_json)
from bokeh.models import GeoJSONDataSource, LinearColorMapper, ColorBar, HoverTool
from bokeh.palettes import brewer

# Bokeh takes the GeoJSON as a string.
geosource = GeoJSONDataSource(geojson=json.dumps(merged_json))

# Sequential multi-hue palette with 8 colours, used in reversed order.
palette = brewer['YlOrRd'][8]
palette = palette[::-1]

# Map case counts linearly onto the palette over a fixed 0-80 range
# (the data maximum, max(merged['CaseCount']), is not used).
color_mapper = LinearColorMapper(palette=palette, low=0, high=80)

# The last tick marks the top of the scale, so it is labelled ">80" to match
# high=80 (it previously read ">90", which disagreed with the scale).
tick_labels = {
    0: "0",
    10: "10",
    20: "20",
    30: "30",
    40: "40",
    50: "50",
    60: "60",
    70: "70",
    80: ">80",
}

# Horizontal colour bar.
color_bar = ColorBar(
    color_mapper=color_mapper,
    label_standoff=8,
    width=500,
    height=20,
    border_line_color=None,
    location=(0, 0),
    orientation='horizontal',
    major_label_overrides=tick_labels,
)

# Hover tool showing count, zip code and community for each patch.
hover = HoverTool(tooltips=[
    ('CaseCount', '@CaseCount'),
    ('Zip Code', '@ZipCode'),
    ('Community Name', '@CommunityName'),
])

# Map figure without axes, grid lines or toolbar.
map_figure = figure(
    x_axis_location=None,
    y_axis_location=None,
    plot_height=1000,
    plot_width=950,
    toolbar_location=None,
    tools=[hover],
)
map_figure.xgrid.grid_line_color = None
map_figure.ygrid.grid_line_color = None

# One patch per zip-code geometry, filled according to its case count.
map_figure.patches(
    'xs',
    'ys',
    source=geosource,
    fill_color={'field': 'CaseCount', 'transform': color_mapper},
    line_color='black',
    line_width=0.25,
    fill_alpha=1,
)
map_figure.title.text_font_size = '16pt'

# Put the colour bar above the map.
map_figure.add_layout(color_bar, 'above')

# Report title and total case count as headings, with the map underneath.
map_collage = Column(
    Div(text=title, style={'font-size': '200%', 'color': 'blue'}),
    Div(text=f'Total number of cases: {total_count}', style={'font-size': '200%', 'color': 'red'}),
    map_figure,
)
# Stack the map (with its headings) above the bar charts and display the result.
collage = Column(map_collage, bar_chart)
show(collage)

# Generate a standalone HTML document with the collage of both plots.
from bokeh.resources import CDN
from bokeh.embed import file_html

try:
    html1 = file_html(collage, CDN, 'Covid19 Cases in San Diego with Bokeh')
except Exception as e:
    # Best effort: report the failure and skip the export. Previously the
    # write below still ran, hit a NameError on the unbound `html1`, and
    # left a truncated output file behind.
    print(e)
else:
    # Explicit UTF-8 so the result does not depend on the platform's default encoding.
    with open('Covid19_apr20.html', 'w', encoding='utf-8') as f:
        f.write(html1)